##load the dataset

CHECK THE HEAD OF THE BIKESHARE DF

# Preview the first six rows of the bikeshare data.
head(bikeshare, n = 6L)
## # A tibble: 6 x 12
##   datetime            season holiday workingday weather  temp atemp humidity
##   <dttm>               <dbl>   <dbl>      <dbl>   <dbl> <dbl> <dbl>    <dbl>
## 1 2011-01-01 00:00:00      1       0          0       1  9.84  14.4       81
## 2 2011-01-01 01:00:00      1       0          0       1  9.02  13.6       80
## 3 2011-01-01 02:00:00      1       0          0       1  9.02  13.6       80
## 4 2011-01-01 03:00:00      1       0          0       1  9.84  14.4       75
## 5 2011-01-01 04:00:00      1       0          0       1  9.84  14.4       75
## 6 2011-01-01 05:00:00      1       0          0       2  9.84  12.9       75
## # … with 4 more variables: windspeed <dbl>, casual <dbl>, registered <dbl>,
## #   count <dbl>

THE TARGET VARIABLE IS COUNT (BIKE RENTALS), TO BE PREDICTED FOR THE 20TH-31ST OF EACH MONTH

CREATE A SCATTER PLOT OF COUNT VS TEMP

library(ggplot2)
library(ggthemes)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Scatter plot of rental count against temperature, coloured by temperature.
# alpha is a fixed transparency, so it is set outside aes(); inside aes() it
# would be mapped like a data variable, rescaled, and given a spurious legend.
plot <- ggplot(data = bikeshare, aes(x = temp, y = count))
plot + geom_point(aes(color = temp), alpha = 0.6)

plot count vs. datetime

# Store the timestamp as POSIXct before plotting.
# The format string is an argument of as.POSIXct(), not of paste(): passing it
# to paste() glued the format text onto every timestamp, and parsing only
# succeeded because trailing text is ignored. The format must also match the
# "YYYY-MM-DD HH:MM:SS" layout of the data. When datetime is already POSIXct
# it is returned unchanged, avoiding a round trip through text.
bikeshare$DateTime <- as.POSIXct(
  bikeshare$datetime,
  format = "%Y-%m-%d %H:%M:%S"
)
class(bikeshare$DateTime)
## [1] "POSIXct" "POSIXt"
# Rental count over time, coloured by temperature (light green = low,
# pink = high). alpha is a constant transparency, so it is set outside aes()
# rather than mapped as an aesthetic.
plot2 <- ggplot(data = bikeshare, aes(x = DateTime, y = count))
plot2 +
  geom_point(aes(color = temp), alpha = 0.6) +
  scale_colour_gradient(high = "pink", low = "light green")

##CORRELATION BETWEEN TEMP AND COUNT

# Pearson correlation between temperature and rental count, computed only on
# rows where both values are present.
with(bikeshare, cor(temp, count, use = "complete.obs", method = "pearson"))
## [1] 0.3944536

explore the season data, create a boxplot

# Distribution of rental counts per season: one filled box per season level.
plot3 <- ggplot(
  data = bikeshare,
  mapping = aes(x = factor(season), y = count)
)
plot3 + geom_boxplot(mapping = aes(fill = factor(season)))

create an hour column that holds the hour of day taken from the DateTime column

# Extract the hour of day as a two-digit character column ("00", "01", ...).
# format() is vectorised over POSIXct, so the whole column is converted in one
# call instead of one sapply() callback per row.
bikeshare$hour <- format(bikeshare$DateTime, "%H")

head(bikeshare)
## # A tibble: 6 x 14
##   datetime            season holiday workingday weather  temp atemp humidity
##   <dttm>               <dbl>   <dbl>      <dbl>   <dbl> <dbl> <dbl>    <dbl>
## 1 2011-01-01 00:00:00      1       0          0       1  9.84  14.4       81
## 2 2011-01-01 01:00:00      1       0          0       1  9.02  13.6       80
## 3 2011-01-01 02:00:00      1       0          0       1  9.02  13.6       80
## 4 2011-01-01 03:00:00      1       0          0       1  9.84  14.4       75
## 5 2011-01-01 04:00:00      1       0          0       1  9.84  14.4       75
## 6 2011-01-01 05:00:00      1       0          0       2  9.84  12.9       75
## # … with 6 more variables: windspeed <dbl>, casual <dbl>, registered <dbl>,
## #   count <dbl>, DateTime <dttm>, hour <chr>

count vs. hour

# Keep only working days (which() also drops rows where workingday is NA).
bike_w1 <- bikeshare[which(bikeshare$workingday == 1), ]

# Rental count by hour of day on working days, coloured by temperature.
plot4 <- ggplot(data = bike_w1, aes(x = hour, y = count))

# alpha is a constant transparency, so it is set outside aes(); the jitter
# arguments are spelled out in full instead of relying on partial matching.
plot4 +
  geom_point(
    aes(color = temp),
    alpha = 0.6,
    position = position_jitter(width = 1, height = 0)
  ) +
  scale_colour_gradient(high = "orange", low = "dark blue")

ggplot for non-working days

# Keep only non-working days (which() also drops rows where workingday is NA).
bike_wk <- bikeshare[which(bikeshare$workingday == 0), ]

# Rental count by hour of day on non-working days, coloured by temperature.
plot5 <- ggplot(data = bike_wk, aes(x = hour, y = count))

# alpha is a constant transparency, so it is set outside aes(); the jitter
# arguments are spelled out in full instead of relying on partial matching.
plot5 +
  geom_point(
    aes(color = temp),
    alpha = 0.6,
    position = position_jitter(width = 1, height = 0)
  ) +
  scale_colour_gradient(high = "dark blue", low = "green")

TRAIN AND TEST DATA

library(caTools)

# Fix the RNG seed so the split is reproducible.
set.seed(101)

# Logical flag per row from sample.split() with SplitRatio = 0.7.
sample <- sample.split(bikeshare$count, SplitRatio = 0.7)

# Rows flagged TRUE form the training set; the remaining rows form the test set.
train <- subset(bikeshare, sample)
test <- subset(bikeshare, !sample)

BUILD THE LINEAR REGRESSION MODEL ON THE TRAINING DATA

# Fit a linear model of count on season (as a factor), workingday, holiday,
# hour, temp and casual, using the training split, then print its summary.
# NOTE(review): `casual` appears to be a component of `count` (in the Kaggle
# bike-sharing data, count = casual + registered) - confirm; if so, using it
# as a predictor leaks the target and inflates the reported fit.
train.model<-lm(formula = count ~ factor(season) + workingday + holiday + hour + temp + casual, data = train)

summary(train.model)
## 
## Call:
## lm(formula = count ~ factor(season) + workingday + holiday + 
##     hour + temp + casual, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -382.92  -45.23   -9.58   43.14  449.95 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -73.26322    5.83746 -12.551  < 2e-16 ***
## factor(season)2  12.15086    3.64732   3.331 0.000868 ***
## factor(season)3  10.61185    4.63163   2.291 0.021981 *  
## factor(season)4  50.20128    2.92276  17.176  < 2e-16 ***
## workingday       80.85236    2.42565  33.332  < 2e-16 ***
## holiday          24.70403    6.13691   4.025 5.74e-05 ***
## hour01          -14.22644    6.76528  -2.103 0.035511 *  
## hour02          -20.08635    6.86078  -2.928 0.003425 ** 
## hour03          -28.38097    6.88698  -4.121 3.81e-05 ***
## hour04          -29.15497    6.82029  -4.275 1.94e-05 ***
## hour05          -15.63559    6.77813  -2.307 0.021094 *  
## hour06           33.87405    6.76304   5.009 5.60e-07 ***
## hour07          157.48062    6.74992  23.331  < 2e-16 ***
## hour08          295.91529    6.82217  43.376  < 2e-16 ***
## hour09          120.01776    6.83367  17.563  < 2e-16 ***
## hour10           35.84389    6.94032   5.165 2.47e-07 ***
## hour11           40.97710    6.92076   5.921 3.34e-09 ***
## hour12           64.30936    7.01301   9.170  < 2e-16 ***
## hour13           52.53715    6.99093   7.515 6.34e-14 ***
## hour14           32.55945    7.13710   4.562 5.15e-06 ***
## hour15           43.00719    7.03676   6.112 1.03e-09 ***
## hour16          109.16193    7.03633  15.514  < 2e-16 ***
## hour17          268.08535    7.07382  37.898  < 2e-16 ***
## hour18          259.23718    6.97918  37.144  < 2e-16 ***
## hour19          164.64639    6.78447  24.268  < 2e-16 ***
## hour20          105.46635    6.70103  15.739  < 2e-16 ***
## hour21           72.99693    6.78025  10.766  < 2e-16 ***
## hour22           49.27957    6.75445   7.296 3.27e-13 ***
## hour23           21.97105    6.82064   3.221 0.001282 ** 
## temp              1.80277    0.23559   7.652 2.22e-14 ***
## casual            2.20688    0.02894  76.261  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 85.83 on 7606 degrees of freedom
## Multiple R-squared:  0.781,  Adjusted R-squared:  0.7802 
## F-statistic: 904.2 on 30 and 7606 DF,  p-value: < 2.2e-16

VISUALIZE THE MODEL RESIDUALS

# Collect the training residuals in a one-column data frame (column `res`).
res <- data.frame(res = residuals(train.model))

# Histogram of the residuals
ggplot(data = res, mapping = aes(x = res)) +
  geom_histogram(fill = "blue", alpha = 0.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Standard diagnostic plots for the fitted linear model.
plot(x = train.model)

##PREDICTIONS

# Predict rental counts for the held-out test rows with the fitted model.
test.predictions <- predict(object = train.model, newdata = test)

##MSE

# Pair each prediction with the observed count for the same test row.
results <- as.data.frame(cbind(pred = test.predictions, real = test$count))

# Calculate the MSE (mean squared prediction error) on the test set
mse <- with(results, mean((real - pred)^2))
print(mse)
## [1] 7181.107
# Square root of the MSE, i.e. the RMSE in the units of count
mse^0.5
## [1] 84.74141

R-squared on the test set

# R-squared on the test set: 1 - SSE / SST.
# SSE is the squared error of the predictions. SST is the squared deviation of
# the observed test counts from their own mean. SST must be centred on the
# mean of the observations being scored; centring it on the mean of the full
# bikeshare data can only make SST larger and therefore overstates R2.
SSE <- sum((results$pred - results$real)^2)
SST <- sum((mean(results$real) - results$real)^2)

R2 <- 1 - SSE / SST

print(R2)
## [1] 0.7696607  (printed before SST used the test-set mean; re-run to refresh)